"""RDD computation exercise: count how often each word occurs in a text file."""

from pyspark import SparkConf, SparkContext
# Use os to tell Spark where the Python interpreter lives, so executor
# processes can launch Python (needed on Windows when it is not on PATH).
import os
# Raw string: the original '...python\Python39...' relied on the invalid
# escape '\P' (SyntaxWarning on modern Python); the path value is unchanged.
os.environ['PYSPARK_PYTHON'] = r'D:\yfxdeve\python\Python39\python.exe'

conf = SparkConf().setMaster("local[*]").setAppName("test_park")
sc = SparkContext(conf=conf)


# --- Word-count case begins ---
# Read the target file; one RDD element per line of text.
rdd1 = sc.textFile(r"D:\hxy\test\测试.txt")
print(rdd1.collect())   # e.g. ['hellolkjasldjfl', 'lksjdf flajs lkjsd ', ...]

# Split every line into words. str.split() with no separator collapses runs
# of whitespace and drops leading/trailing blanks, so it never emits ''
# tokens — the original split(" ") counted '' as a word (the sample output
# showed ('', 2) in the final tally).
rdd2 = rdd1.flatMap(lambda line: line.split())
print(rdd2.collect())  # e.g. ['hellolkjasldjfl', 'lksjdf', 'flajs', ...]

# Pair each word with an initial count of 1.
rdd3 = rdd2.map(lambda word: (word, 1))
print(rdd3.collect())  # e.g. [('hellolkjasldjfl', 1), ('lksjdf', 1), ...]

# Sum the counts per distinct word.
rdd4 = rdd3.reduceByKey(lambda a, b: a + b)
print(rdd4.collect())  # e.g. [('hellolkjasldjfl', 1), ('flajs', 4), ...]
# --- Word-count case ends ---

# Release the SparkContext and its cluster resources.
sc.stop()